Use output from Humann2 to assess the gene families, pathway abundance, and pathway coverage, assigned to Gardnerella vaginalis across samples.

Setup

Packages

library(tidyverse)
## ── Attaching packages ────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.1       ✔ purrr   0.3.2  
## ✔ tibble  2.1.1       ✔ dplyr   0.8.0.1
## ✔ tidyr   0.8.3       ✔ stringr 1.4.0  
## ✔ readr   1.3.1       ✔ forcats 0.4.0
## ── Conflicts ───────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(ggrepel)

File paths

Metadata

sgDF <- read_delim("/Users/hlberman/Documents/GitHub/pregnancy_metagenome/shotgunMetadata.tsv", delim = "\t")%>%
  mutate(SampleID=as.character(SampleID))
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   SampleIndex = col_character(),
##   SampleType = col_character(),
##   TermPre = col_character(),
##   Operator = col_character(),
##   PctTrim = col_character(),
##   RNAlater = col_character(),
##   Cohort = col_character()
## )
## See spec(...) for full column specifications.
load("/Users/hlberman/Documents/GitHub/pregnancy_metagenome/methods_comparisons/PS_RData/metaphlan_PS.RData")

Gardnerella abundance table

#Clade assignments
gardRelativeAbundance <- read_delim("/Users/hlberman/Documents/GitHub/pregnancy_metagenome/gard_mapping/clade_assignments/gardRelativeAbundance.tsv", delim = "\t") %>%
  mutate(SampleID=as.character(SampleID),
         Clade=as.character(Clade))
## Parsed with column specification:
## cols(
##   SampleID = col_double(),
##   Clade = col_double(),
##   n = col_double(),
##   propClade = col_double()
## )
SampleIDlist <- rep(unique(sgDF$SampleID), 6) %>%
  .[order(.)]

metaphlanGard <- PS_metaphlan_all@otu_table %>%
  t() %>% 
  as.data.frame() %>%
  mutate(SampleID=rownames(.)) %>%
  select(Gardnerella_vaginalis, SampleID)
## Loading required package: phyloseq
cladeFrame <- tibble(SampleID=SampleIDlist,
                     Clade=rep(c("1","2", "3", "4", "5", "6"), 107),
                     n=0, 
                     propClade=0)

gardRelativeAbundance0 <- gardRelativeAbundance %>%
 full_join(cladeFrame) %>%
 group_by(SampleID, Clade) %>%
 filter(n==max(n)) %>%
 ungroup() %>%
 select(-n) %>%
 spread(Clade, propClade) %>%   
 mutate(Clades=case_when((`1`>0 |`2`>0) & `3`==0 & `4`==0 & `5`==0 & `6`==0 ~ "1_2", 
                         `1`==0 & `2`==0 & (`3`>0 | `4`>0) & `5`==0 & `6`==0 ~ "3_4",
                         `1`==0 & `2`==0 & `3`==0 & `4`==0 & (`5`>0 | `6`>0) ~ "5_6",
                         `1` > 0 | `2` > 0 | `3`>0 | `4`>0 | `5`>0 | `6`>0 ~ "mix",
                         `1`==0 & `2` ==0 & `3`==0 & `4`==0 & `5`==0 & `6`==0 ~"none")) %>%
 left_join(metaphlanGard, by="SampleID") %>%
 left_join(unique.data.frame(sgDF[,c("SampleID", "SubjectID", "TermPre", "GWdell", "Cohort", "GWcoll")])) %>%
 mutate(SubjectID=as.character(SubjectID)) 
## Joining, by = c("SampleID", "Clade", "n", "propClade")
## Joining, by = "SampleID"

Humann2 Output tables

#humann2 outs
dataDir <- "/Volumes/GoogleDrive/My Drive/Callahan Lab/vagMicrobiomeMetagenome/humann2/out_tables"
GF_path <- file.path(dataDir, "VM_genefamilies_names_cpm.tsv")
GFP_path <- file.path(dataDir, "VM_genefamilies_paths_cpm.tsv")
GOnames_path <- file.path(dataDir, "map_go_name.txt")
GFGO_path <- file.path(dataDir, "map_go_uniref90.txt")
PWnames_path <- file.path(dataDir, "map_metacyc-pwy_name.txt") 
PC_path <- file.path(dataDir, "VM_pathcoverage.tsv")
PA_path <- file.path(dataDir, "VM_pathabundance_cpm.tsv")

# read in data frames
GF <- read_delim(GF_path, delim = "\t") # Gene family abundances
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   `# Gene Family` = col_character()
## )
## See spec(...) for full column specifications.
GFP <- read_delim(GFP_path, delim = "\t") # pathways for each gene family
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   `1000801248_Abundance` = col_double()
## )
## See spec(...) for full column specifications.
## Warning: 1662 parsing failures.
## row col    expected    actual                                                                                                                  file
##   1  -- 108 columns 2 columns '/Volumes/GoogleDrive/My Drive/Callahan Lab/vagMicrobiomeMetagenome/humann2/out_tables/VM_genefamilies_paths_cpm.tsv'
##   2  -- 108 columns 2 columns '/Volumes/GoogleDrive/My Drive/Callahan Lab/vagMicrobiomeMetagenome/humann2/out_tables/VM_genefamilies_paths_cpm.tsv'
##   3  -- 108 columns 2 columns '/Volumes/GoogleDrive/My Drive/Callahan Lab/vagMicrobiomeMetagenome/humann2/out_tables/VM_genefamilies_paths_cpm.tsv'
##   4  -- 108 columns 2 columns '/Volumes/GoogleDrive/My Drive/Callahan Lab/vagMicrobiomeMetagenome/humann2/out_tables/VM_genefamilies_paths_cpm.tsv'
##   5  -- 108 columns 2 columns '/Volumes/GoogleDrive/My Drive/Callahan Lab/vagMicrobiomeMetagenome/humann2/out_tables/VM_genefamilies_paths_cpm.tsv'
## ... ... ........... ......... .....................................................................................................................
## See problems(...) for more details.
PWnames <- read_delim(PWnames_path, delim = "\t", col_names = c("Pathway", "pathwayAnnotation"))
## Parsed with column specification:
## cols(
##   Pathway = col_character(),
##   pathwayAnnotation = col_character()
## )
PC <- read_delim(PC_path, delim = "\t") # pathway coverage 
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   `# Pathway` = col_character()
## )
## See spec(...) for full column specifications.
PA <- read_delim(PA_path, delim = "\t") # pathway abundnance
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   `# Pathway` = col_character()
## )
## See spec(...) for full column specifications.
GOnames <- read_delim(GOnames_path, delim = "\t", col_names = c("GO", "Function")) # names and annotations of GO categories
## Parsed with column specification:
## cols(
##   GO = col_character(),
##   Function = col_character()
## )
GFGO <- read_lines(GFGO_path) %>% # read in the file as a list
  str_split("\t")

Get Gardnerella genes/pathways only

Re-format tables for dplyr

Paths for gene families

GFP0 <- GFP %>%
  select(`# Pathway`) %>% # keep only pathway and gene family names
  filter(str_detect(`# Pathway`, "Gardnerella")) %>% # keep only Gardnerella
  separate(`# Pathway`, c("Pathway", "GeneFamily"), sep ="\\|g__Gardnerella.s__Gardnerella_vaginalis\\|", fill = "right") %>% # separate 
  separate(GeneFamily, c("uniref", "unirefAnnotation"), ": ") %>%
  filter(uniref != "") # remove empty pathways
head(GFP0)
## # A tibble: 6 x 3
##   Pathway        uniref          unirefAnnotation                      
##   <chr>          <chr>           <chr>                                 
## 1 COA-PWY-1      UniRef90_I4LNX4 Dephospho-CoA kinase                  
## 2 COA-PWY-1      UniRef90_D2R9V1 Phosphopantetheine adenylyltransferase
## 3 COA-PWY-1      UniRef90_D2RAK6 Dephospho-CoA kinase                  
## 4 COA-PWY-1      UniRef90_D2RBT1 Type III pantothenate kinase          
## 5 NONOXIPENT-PWY UniRef90_E3D9X8 Transaldolase                         
## 6 NONOXIPENT-PWY UniRef90_D6SZM2 Transaldolase

GO categories for gene family

GOcategories <- map(GFGO, `[[`, 1) %>% # get the GO category labels
  unlist()
GFGO0 <- map2(GOcategories, GFGO, ~paste(.x, .y, sep=",")) %>% # paste a separable (important for later making a dataframe) GO category to each gene family in each category
  unlist() %>% # collapse list
  as_tibble() %>% # make dataframe
  separate(1, c("GO", "uniref"), ",") %>% #separate into columns
  filter(!str_detect(uniref, "GO:[0-9]*$")) # remove rows with GO category label in uniref column
## Warning: Calling `as_tibble()` on a vector is discouraged, because the behavior is likely to change in the future. Use `tibble::enframe(name = NULL)` instead.
## This warning is displayed once per session.
head(GFGO0)
## # A tibble: 6 x 2
##   GO         uniref             
##   <chr>      <chr>              
## 1 GO:0000001 UniRef90_A0A016PS51
## 2 GO:0000001 UniRef90_A1C8D3    
## 3 GO:0000001 UniRef90_A1CBR9    
## 4 GO:0000001 UniRef90_A1CE31    
## 5 GO:0000001 UniRef90_A1CNY1    
## 6 GO:0000001 UniRef90_A1CPB1

Gene Family

GF0 <- GF %>%
  filter(str_detect(`# Gene Family`, "Gardnerella")) %>%
  as.data.frame() 
rownames(GF0) <- GF0[,1]
GF1 <- GF0 %>%
  select(-`# Gene Family`)
GF2 <- GF1 %>%
  t() %>%
  as.data.frame(strinsAsFactors=FALSE) %>%
  mutate(SampleID=str_extract(colnames(GF1), "[0-9]{10}")) %>%
  gather("GeneFamily", "Abundance", contains("Gardnerella")) %>%
  mutate(GeneFamily=str_extract(GeneFamily, ".*(?=.g__Gardnerella)")) %>%
  separate(GeneFamily, c("uniref", "unirefAnnotation"), ": ", fill = "right") %>%
  left_join(gardRelativeAbundance0[,c("SampleID", "SubjectID", "TermPre", "GWdell", "GWcoll", "Cohort", "Clades")], by="SampleID")
head(GF2)
##     SampleID           uniref unirefAnnotation Abundance SubjectID TermPre
## 1 1000801248 UniRef90_unknown             <NA>   293.681     10008       T
## 2 1000801318 UniRef90_unknown             <NA>   168.545     10008       T
## 3 1000801368 UniRef90_unknown             <NA>   262.059     10008       T
## 4 1001301158 UniRef90_unknown             <NA>  5946.180     10013       P
## 5 1001301248 UniRef90_unknown             <NA> 11876.100     10013       P
## 6 1001301318 UniRef90_unknown             <NA>   807.692     10013       P
##   GWdell GWcoll   Cohort Clades
## 1     41     25 Stanford    3_4
## 2     41     32 Stanford    3_4
## 3     41     37 Stanford    3_4
## 4     35     16 Stanford    mix
## 5     35     25 Stanford    mix
## 6     35     31 Stanford    mix

Pathway Abundance

PA0 <- PA %>%
  filter(str_detect(`# Pathway`, "Gardnerella")) %>%
  as.data.frame() 
rownames(PA0) <- PA0[,1]
PA1 <- PA0 %>%
  select(-`# Pathway`)
PA2 <- PA1 %>%
  t() %>%
  as.data.frame(strinsAsFactors=FALSE) %>%
  mutate(SampleID=str_extract(colnames(PA1), "[0-9]{10}")) %>%
  gather("Pathway", "Abundance", contains("Gardnerella")) %>%
  mutate(Pathway=str_extract(Pathway, ".*(?=.g__Gardnerella)")) %>%
  left_join(gardRelativeAbundance0[,c("SampleID", "SubjectID", "TermPre", "GWdell", "GWcoll", "Cohort", "Clades")], by="SampleID")
head(PA2)
##     SampleID      Pathway Abundance SubjectID TermPre GWdell GWcoll
## 1 1000801248 UNINTEGRATED   5507.21     10008       T     41     25
## 2 1000801318 UNINTEGRATED   3295.92     10008       T     41     32
## 3 1000801368 UNINTEGRATED   4797.42     10008       T     41     37
## 4 1001301158 UNINTEGRATED  73881.90     10013       P     35     16
## 5 1001301248 UNINTEGRATED 174983.00     10013       P     35     25
## 6 1001301318 UNINTEGRATED  15372.60     10013       P     35     31
##     Cohort Clades
## 1 Stanford    3_4
## 2 Stanford    3_4
## 3 Stanford    3_4
## 4 Stanford    mix
## 5 Stanford    mix
## 6 Stanford    mix

Pathway coverage

PC0 <- PC %>%
  filter(str_detect(`# Pathway`, "Gardnerella")) %>%
  as.data.frame() 
rownames(PC0) <- PC0[,1]
PC1 <- PC0 %>%
  select(-`# Pathway`)
PC2 <- PC1 %>%
  t() %>%
  as.data.frame(strinsAsFactors=FALSE) %>%
  mutate(SampleID=str_extract(colnames(PC1), "[0-9]{10}")) %>%
  gather("Pathway", "Coverage", contains("Gardnerella")) %>%
  mutate(Pathway=str_extract(Pathway, ".*(?=.g__Gardnerella)")) %>%
  left_join(gardRelativeAbundance0[,c("SampleID", "SubjectID", "TermPre", "GWdell", "GWcoll", "Cohort", "Clades")], by="SampleID")
head(PC2)
##     SampleID      Pathway Coverage SubjectID TermPre GWdell GWcoll
## 1 1000801248 UNINTEGRATED        1     10008       T     41     25
## 2 1000801318 UNINTEGRATED        1     10008       T     41     32
## 3 1000801368 UNINTEGRATED        1     10008       T     41     37
## 4 1001301158 UNINTEGRATED        1     10013       P     35     16
## 5 1001301248 UNINTEGRATED        1     10013       P     35     25
## 6 1001301318 UNINTEGRATED        1     10013       P     35     31
##     Cohort Clades
## 1 Stanford    3_4
## 2 Stanford    3_4
## 3 Stanford    3_4
## 4 Stanford    mix
## 5 Stanford    mix
## 6 Stanford    mix

By Gard Clade

Gene Families

Gene Familes present in both TB and PTB

geneFamilies <- unique(paste(GF2$uniref, GF2$unirefAnnotation, sep = ": "))
GFfoldChange <- GF2 %>%
  filter(Clades=="1_2"|Clades=="3_4") %>%
  group_by(uniref, unirefAnnotation, Clades, Cohort) %>%
  summarise(mean(Abundance)) %>%
  ungroup() %>%
  spread(Clades, `mean(Abundance)`) %>%
  filter(Cohort=="UAB"|Cohort=="Stanford",
          `1_2`>0,
          `3_4`>0) %>%
  mutate(FoldChange=case_when(`1_2`>`3_4`~`1_2`/`3_4`,
                              `3_4`>`1_2`~-`3_4`/`1_2`)) #%>%
#  left_join(GFGO0, by="uniref") %>%  # for adding GO information to the dataframe. removed for now as multiple GO categories were assigned to many uniref 
#  left_join(GOnames, by= "GO")
  
labels1 <- GFfoldChange[order(GFfoldChange$FoldChange),] %>%
  select(uniref, unirefAnnotation, Cohort, FoldChange)
labels1 <- labels1[1:10,]
labels2 <- GFfoldChange[order(-GFfoldChange$FoldChange),] %>%
  select(uniref, unirefAnnotation, Cohort, FoldChange)
labels2 <- labels2[1:10,]
labels <- rbind(labels1, labels2) %>%
  filter(unirefAnnotation!="NO_NAME")

ggplot(GFfoldChange, aes(x=uniref, y=FoldChange, label = unirefAnnotation)) +
  geom_point() +
  theme(axis.text.x = element_blank()) +
  ggrepel::geom_text_repel(data=labels) +
  labs(y=element_blank()) +
  facet_wrap(~Cohort) +
  annotate("text", label = "Positive=greater in 1/2", x=200, y = 6000) 


## Gene Families with Pathway information

GFfoldChangePath <- GFfoldChange %>%
  left_join(GFP0, by=c("uniref", "unirefAnnotation")) %>%
  filter(!is.na(Pathway)) %>%
  left_join(PWnames, by="Pathway")

ggplot(GFfoldChangePath, aes(x=uniref, y=FoldChange, color = paste(Pathway, pathwayAnnotation), label=unirefAnnotation)) +
   geom_text(size=2) +
   theme(axis.text.x = element_blank()) +
   labs(y=element_blank(), color="Pathway") +
   facet_wrap(~Cohort)


List of greatest fold change uniref gene families (pathways not annotated for greatest fold change uniref genes)

GFfoldChange %>%
  filter(FoldChange>=2000|FoldChange<=-1000) %>%
  print(n = nrow(.), width = getOption("tibble.width"), n_extra = NULL)
## # A tibble: 83 x 6
##    uniref     unirefAnnotation            Cohort   `1_2`   `3_4` FoldChange
##    <chr>      <chr>                       <chr>    <dbl>   <dbl>      <dbl>
##  1 UniRef90_… UDP-N-acetylenolpyruvoylgl… Stanf… 2.99e-3 4.49e+0     -1501.
##  2 UniRef90_… Ribonuclease HI             Stanf… 3.50e-3 4.65e+0     -1325.
##  3 UniRef90_… HTH domain protein          Stanf… 3.36e-3 4.14e+0     -1233.
##  4 UniRef90_… Signal recognition particl… Stanf… 4.41e-3 4.57e+0     -1036.
##  5 UniRef90_… NO_NAME                     Stanf… 2.93e-3 4.17e+0     -1423.
##  6 UniRef90_… Drug resistance MFS transp… Stanf… 2.32e-3 4.43e+0     -1911.
##  7 UniRef90_… Universal stress family pr… Stanf… 3.83e-3 5.03e+0     -1313.
##  8 UniRef90_… Sortase family protein      Stanf… 3.58e-3 4.20e+0     -1172.
##  9 UniRef90_… Cell division protein FtsZ  Stanf… 3.15e-3 4.11e+0     -1308.
## 10 UniRef90_… Cell envelope-like functio… Stanf… 2.49e-3 3.73e+0     -1499.
## 11 UniRef90_… Mg chelatase-like protein   Stanf… 2.25e-3 3.70e+0     -1646.
## 12 UniRef90_… RNA polymerase sigma facto… Stanf… 2.54e-3 3.36e+0     -1324.
## 13 UniRef90_… NO_NAME                     Stanf… 3.61e-3 7.99e+0     -2211.
## 14 UniRef90_… Allantoate amidohydrolase   Stanf… 3.03e-3 3.38e+0     -1114.
## 15 UniRef90_… Tetratricopeptide repeat p… Stanf… 1.02e-3 1.75e+0     -1722.
## 16 UniRef90_… Aminotransferase, class I/… Stanf… 3.23e-3 3.47e+0     -1075.
## 17 UniRef90_… Na(+)/H(+) antiporter NhaA  Stanf… 6.04e-3 1.09e+1     -1799.
## 18 UniRef90_… Carbamoyl-phosphate syntha… Stanf… 3.16e-3 3.85e+0     -1218.
## 19 UniRef90_… Dihydroorotase              Stanf… 2.66e-3 3.88e+0     -1463.
## 20 UniRef90_… ABC transporter, permease … Stanf… 3.96e-3 4.24e+0     -1070.
## 21 UniRef90_… DivIVA domain protein       Stanf… 2.84e-3 4.58e+0     -1611.
## 22 UniRef90_… NO_NAME                     Stanf… 3.27e-3 3.94e+0     -1203.
## 23 UniRef90_… ABC transporter, permease … Stanf… 4.20e-3 4.94e+0     -1175.
## 24 UniRef90_… Phosphoenolpyruvate-protei… Stanf… 4.09e-3 4.49e+0     -1098.
## 25 UniRef90_… NO_NAME                     Stanf… 4.64e-3 5.17e+0     -1115.
## 26 UniRef90_… Transcriptional regulator,… Stanf… 3.87e-3 4.53e+0     -1169.
## 27 UniRef90_… Alkyl hydroperoxide reduct… Stanf… 4.40e-3 4.80e+0     -1091.
## 28 UniRef90_… Heavy metal translocating … Stanf… 3.83e-3 4.97e+0     -1297.
## 29 UniRef90_… Cell cycle protein, FtsW/R… Stanf… 5.04e-3 5.96e+0     -1182.
## 30 UniRef90_… Protein phosphatase 2C      Stanf… 4.43e-3 5.77e+0     -1303.
## 31 UniRef90_… Ribosomal RNA small subuni… Stanf… 4.49e-3 6.00e+0     -1337.
## 32 UniRef90_… Kinase domain protein       Stanf… 3.97e-3 5.49e+0     -1382.
## 33 UniRef90_… MATE efflux family protein  Stanf… 2.77e-3 4.91e+0     -1775.
## 34 UniRef90_… Chloride transporter, ClC … Stanf… 2.75e-3 5.47e+0     -1987.
## 35 UniRef90_… DNA polymerase III, subuni… Stanf… 2.98e-3 4.94e+0     -1661.
## 36 UniRef90_… Aspartate-semialdehyde deh… Stanf… 3.65e-3 4.72e+0     -1291.
## 37 UniRef90_… NO_NAME                     Stanf… 3.03e-3 4.26e+0     -1405.
## 38 UniRef90_… Efflux ABC transporter, pe… Stanf… 2.53e-3 4.60e+0     -1821.
## 39 UniRef90_… Subtilisin-like serine pro… Stanf… 2.14e-3 2.21e+0     -1033.
## 40 UniRef90_… dGTP triphosphohydrolase    Stanf… 2.96e-3 4.03e+0     -1364.
## 41 UniRef90_… NO_NAME                     Stanf… 3.52e-3 4.17e+0     -1183.
## 42 UniRef90_… NO_NAME                     Stanf… 5.78e-4 1.03e+0     -1775.
## 43 UniRef90_… Subtilisin-like serine pro… Stanf… 2.29e-3 3.48e+0     -1522.
## 44 UniRef90_… Conserved uncharacterized … Stanf… 4.17e-3 4.30e+0     -1031.
## 45 UniRef90_… Chromosome segregation ATP… Stanf… 5.64e-4 5.77e-1     -1022.
## 46 UniRef90_… NO_NAME                     Stanf… 2.63e+1 1.24e-2      2115.
## 47 UniRef90_… NO_NAME                     Stanf… 2.59e+1 1.23e-2      2109.
## 48 UniRef90_… Beta galactosidase small c… Stanf… 2.64e+1 7.29e-3      3616.
## 49 UniRef90_… Aminopeptidase P domain pr… Stanf… 2.18e+1 7.60e-3      2866.
## 50 UniRef90_… Protein phosphatase 2C      Stanf… 2.61e+1 7.11e-3      3674.
## 51 UniRef90_… Cell cycle protein, FtsW/R… Stanf… 2.59e+1 8.60e-3      3009.
## 52 UniRef90_… NO_NAME                     Stanf… 2.56e+1 9.29e-3      2758.
## 53 UniRef90_… N-acetylglucosamine-6-phos… Stanf… 2.80e+1 9.87e-3      2836.
## 54 UniRef90_… Acyltransferase             Stanf… 2.04e+1 9.61e-3      2122.
## 55 UniRef90_… Transcriptional regulator,… Stanf… 2.67e+1 1.27e-2      2095.
## 56 UniRef90_… ABC transporter, solute-bi… Stanf… 2.73e+1 1.02e-2      2691.
## 57 UniRef90_… ABC transporter, solute-bi… Stanf… 2.02e+1 8.84e-3      2283.
## 58 UniRef90_… Glycosyl hydrolase, family… Stanf… 1.34e+1 4.12e-3      3262.
## 59 UniRef90_… Actinobacterial surface-an… Stanf… 1.92e+1 8.84e-3      2170.
## 60 UniRef90_… Homoserine kinase           Stanf… 2.62e+1 1.24e-2      2108.
## 61 UniRef90_… Septum formation protein M… Stanf… 2.52e+1 8.54e-3      2949.
## 62 UniRef90_… DivIVA domain repeat prote… Stanf… 1.79e+1 7.96e-3      2252.
## 63 UniRef90_… Dihydroorotase              Stanf… 1.81e+1 8.67e-3      2085.
## 64 UniRef90_… NO_NAME                     Stanf… 1.82e+1 6.18e-3      2942.
## 65 UniRef90_… Tetratricopeptide repeat p… Stanf… 1.80e+1 6.97e-3      2579.
## 66 UniRef90_… Bifunctional purine biosyn… Stanf… 1.88e+1 7.15e-3      2623.
## 67 UniRef90_… Replicative DNA helicase    Stanf… 2.53e+1 8.27e-3      3065.
## 68 UniRef90_… NO_NAME                     Stanf… 2.61e+1 9.40e-3      2775.
## 69 UniRef90_… Transporter, CPA2 family    Stanf… 2.48e+1 1.15e-2      2168.
## 70 UniRef90_… Cyclomaltodextrinase        Stanf… 1.93e+1 6.79e-3      2847.
## 71 UniRef90_… Cell envelope-like functio… Stanf… 1.91e+1 8.40e-3      2277.
## 72 UniRef90_… Alkyl hydroperoxide reduct… Stanf… 1.87e+1 7.41e-3      2527.
## 73 UniRef90_… AMP-binding enzyme          Stanf… 1.83e+1 6.59e-3      2785.
## 74 UniRef90_… NO_NAME                     Stanf… 1.47e+1 4.79e-3      3062.
## 75 UniRef90_… NO_NAME                     Stanf… 7.73e+0 3.53e-3      2191.
## 76 UniRef90_… Primosome assembly protein… Stanf… 1.69e+1 4.77e-3      3544.
## 77 UniRef90_… FtsW-like protein           Stanf… 1.97e+1 9.29e-3      2123.
## 78 UniRef90_… Cation-transporting ATPase  Stanf… 2.67e-3 3.04e+0     -1142.
## 79 UniRef90_… Deoxyguanosinetriphosphate… Stanf… 1.95e+1 9.67e-3      2012.
## 80 UniRef90_… Dihydroorotate dehydrogena… Stanf… 3.85e-3 4.58e+0     -1188.
## 81 UniRef90_… NO_NAME                     Stanf… 7.92e+0 2.64e-3      2999.
## 82 UniRef90_… UvrD/REP helicase           Stanf… 1.62e+1 2.70e-3      6006.
## 83 UniRef90_… NO_NAME                     Stanf… 1.77e+1 8.21e-3      2162.

Gene familes only present

alternatePresence <- GF2 %>%
  group_by(uniref, unirefAnnotation, Clades, Cohort) %>%
  summarise(mean(Abundance)) %>%
  ungroup() %>%
  spread(Clades, `mean(Abundance)`) %>%
  filter(Cohort=="UAB"|Cohort=="Stanford",
         `1_2`==0|`3_4`==0,
         !(`1_2`==0&`3_4`==0)) %>%
  mutate(`3_4`=-`3_4`) %>%
  gather("Clades", "Abundance", c(`1_2`, `3_4`)) %>% #make abundance one column instead of separate T and P columsn
  filter(Abundance!=0) %>% # get rid of duplicate entries where abundance = 0  
  left_join(GFP0, by=c("uniref", "unirefAnnotation")) %>%
  left_join(PWnames, by="Pathway")

ggplot(alternatePresence, aes(x=reorder(uniref, Abundance), y=Abundance))+
  geom_col() + 
  theme(axis.text.x = element_blank()) + 
  labs(x="Uniref90", y="Average reads per million") +
  facet_wrap(~Cohort) +
  annotate("text", label = "Positive=greater in 1/2", x=300, y = 400)

# remove bulk of the figure with little variation
alternatePresence %>%
  filter(Abundance>=80|Abundance<=-30) %>%
ggplot(aes(x=reorder(uniref, Abundance), y=Abundance)) +
  geom_col() + 
  theme(axis.text.x = element_blank()) + 
  labs(x="Uniref90", y="Average reads per million") +
  facet_wrap(~Cohort) + 
  annotate("text", label = "Positive=greater in 1/2", x=3, y = 400) 


Save list of alternately present genes and show list of greatest differences

alternatePresence %>%
  filter(Abundance>=800|Abundance<=-30) %>%
  arrange(-Abundance) %>%
  print(n = nrow(.), width = getOption("tibble.width"), n_extra = NULL)
## # A tibble: 11 x 10
##    uniref unirefAnnotation Cohort `5_6`    mix  none Clades Abundance
##    <chr>  <chr>            <chr>  <dbl>  <dbl> <dbl> <chr>      <dbl>
##  1 UniRe… NO_NAME          Stanf…     0 170.       0 3_4        -32.2
##  2 UniRe… NO_NAME          Stanf…     0 491.       0 3_4        -33.8
##  3 UniRe… NO_NAME          Stanf…     0   1.81     0 3_4        -33.8
##  4 UniRe… NO_NAME          Stanf…     0  44.8      0 3_4        -35.6
##  5 UniRe… NO_NAME          Stanf…     0 126.       0 3_4        -39.6
##  6 UniRe… NO_NAME          Stanf…     0 290.       0 3_4        -40.8
##  7 UniRe… NO_NAME          Stanf…     0 234.       0 3_4        -49.2
##  8 UniRe… NO_NAME          Stanf…     0 409.       0 3_4        -57.7
##  9 UniRe… NO_NAME          Stanf…     0 412.       0 3_4        -68.7
## 10 UniRe… NO_NAME          Stanf…     0 218.       0 3_4        -74.4
## 11 UniRe… NO_NAME          Stanf…     0 499.       0 3_4        -84.9
## # … with 2 more variables: Pathway <chr>, pathwayAnnotation <chr>


Need to continure to assess gene function and pathway, and also look at distribution across samples of each of these genes.

Pathway Abundance

Assess reads per million in each annotated pathway (MetaCyc) from Gardnerella vaginalis sample across in term versus preterm birth. Cumulative reads in these pathways attributed to Gardnerella vaginalis determined by Humann2.

pathways <- unique(PA2$Pathway)
pathways
##  [1] "UNINTEGRATED"                                                                                
##  [2] "COA-PWY-1: coenzyme A biosynthesis II (mammalian)"                                           
##  [3] "NONOXIPENT-PWY: pentose phosphate pathway (non-oxidative branch)"                            
##  [4] "PENTOSE-P-PWY: pentose phosphate pathway"                                                    
##  [5] "PEPTIDOGLYCANSYN-PWY: peptidoglycan biosynthesis I (meso-diaminopimelate containing)"        
##  [6] "PWY-5100: pyruvate fermentation to acetate and lactate II"                                   
##  [7] "PWY-5686: UMP biosynthesis"                                                                  
##  [8] "PWY-6122: 5-aminoimidazole ribonucleotide biosynthesis II"                                   
##  [9] "PWY-6151: S-adenosyl-L-methionine cycle I"                                                   
## [10] "PWY-6277: superpathway of 5-aminoimidazole ribonucleotide biosynthesis"                      
## [11] "PWY-6385: peptidoglycan biosynthesis III (mycobacteria)"                                     
## [12] "PWY-6386: UDP-N-acetylmuramoyl-pentapeptide biosynthesis II (lysine-containing)"             
## [13] "PWY-6387: UDP-N-acetylmuramoyl-pentapeptide biosynthesis I (meso-diaminopimelate containing)"
## [14] "PWY-7219: adenosine ribonucleotides de novo biosynthesis"                                    
## [15] "PWY-7221: guanosine ribonucleotides de novo biosynthesis"                                    
## [16] "THRESYN-PWY: superpathway of L-threonine biosynthesis"
map(pathways, ~filter(PA2, Pathway==.x) %>% 
  ggplot(., aes(x=Clades, y=Abundance, color=Clades)) +
  geom_jitter() +
  geom_boxplot(alpha=0) +
  scale_y_log10() +
  labs(title=.x, x=element_blank(), y="Abundance (reads per million") + 
  #scale_x_discrete(labels = c("Preterm", "Term")) + 
  #scale_color_manual(values=c("#56B4E9", "#D55E00")) +  
  theme(legend.position = "none") +
  facet_wrap(~Cohort))
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

## 
## [[11]]

## 
## [[12]]

## 
## [[13]]

## 
## [[14]]

## 
## [[15]]

## 
## [[16]]


Need to find cohort information of samples of unkown cohort. Some differences apparent between Stanford and UAB cohorts. of note is that no PTB samples have reads from the peptidoglycan biosynthesis III pathway in the Stanford cohort and only 3 Stanford term samples had reads from this pathway.

Pathway Coverage

PC3 <- PC2 %>%
  filter(Pathway!="UNINTEGRATED") # covereage value of "unintegrated" is meaningless
map(pathways[2:length(pathways)], ~filter(PC3, Pathway==.x) %>% # remove unintegrated from pathways list.
  ggplot(., aes(x=Clades, y=Coverage, color=Clades)) +
  geom_jitter() +
  geom_boxplot(alpha=0) +
  labs(title=.x, x=element_blank()) + 
  #scale_x_discrete(labels = c("Preterm", "Term")) +
  ylim(0,1) +  
  #scale_color_manual(values=c("#56B4E9", "#D55E00")) +  
  theme(legend.position = "none") +
  facet_wrap(~Cohort))
## [[1]]
## Warning: Removed 25 rows containing missing values (geom_point).

## 
## [[2]]
## Warning: Removed 50 rows containing missing values (geom_point).

## 
## [[3]]
## Warning: Removed 43 rows containing missing values (geom_point).

## 
## [[4]]
## Warning: Removed 28 rows containing missing values (geom_point).

## 
## [[5]]
## Warning: Removed 23 rows containing missing values (geom_point).

## 
## [[6]]
## Warning: Removed 15 rows containing missing values (geom_point).

## 
## [[7]]
## Warning: Removed 14 rows containing missing values (geom_point).

## 
## [[8]]
## Warning: Removed 17 rows containing missing values (geom_point).

## 
## [[9]]
## Warning: Removed 14 rows containing missing values (geom_point).

## 
## [[10]]
## Warning: Removed 52 rows containing missing values (geom_point).

## 
## [[11]]
## Warning: Removed 24 rows containing missing values (geom_point).

## 
## [[12]]
## Warning: Removed 20 rows containing missing values (geom_point).

## 
## [[13]]
## Warning: Removed 23 rows containing missing values (geom_point).

## 
## [[14]]
## Warning: Removed 20 rows containing missing values (geom_point).

## 
## [[15]]
## Warning: Removed 18 rows containing missing values (geom_point).

Session Info

sessionInfo()
## R version 3.5.2 (2018-12-20)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.6
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] phyloseq_1.26.1 ggrepel_0.8.1   forcats_0.4.0   stringr_1.4.0  
##  [5] dplyr_0.8.0.1   purrr_0.3.2     readr_1.3.1     tidyr_0.8.3    
##  [9] tibble_2.1.1    ggplot2_3.1.1   tidyverse_1.2.1
## 
## loaded via a namespace (and not attached):
##  [1] Biobase_2.42.0      httr_1.4.0          jsonlite_1.6       
##  [4] splines_3.5.2       foreach_1.4.4       modelr_0.1.4       
##  [7] assertthat_0.2.1    stats4_3.5.2        cellranger_1.1.0   
## [10] yaml_2.2.0          pillar_1.3.1        backports_1.1.4    
## [13] lattice_0.20-38     glue_1.3.1          digest_0.6.18      
## [16] XVector_0.22.0      rvest_0.3.3         colorspace_1.4-1   
## [19] htmltools_0.3.6     Matrix_1.2-17       plyr_1.8.4         
## [22] pkgconfig_2.0.2     broom_0.5.2         haven_2.1.0        
## [25] zlibbioc_1.28.0     scales_1.0.0        mgcv_1.8-28        
## [28] generics_0.0.2      IRanges_2.16.0      withr_2.1.2        
## [31] BiocGenerics_0.28.0 lazyeval_0.2.2      cli_1.1.0          
## [34] survival_2.44-1.1   magrittr_1.5        crayon_1.3.4       
## [37] readxl_1.3.1        evaluate_0.13       fansi_0.4.0        
## [40] nlme_3.1-139        MASS_7.3-51.4       xml2_1.2.0         
## [43] vegan_2.5-4         tools_3.5.2         data.table_1.12.2  
## [46] hms_0.4.2           Rhdf5lib_1.4.3      S4Vectors_0.20.1   
## [49] munsell_0.5.0       cluster_2.0.8       Biostrings_2.50.2  
## [52] ade4_1.7-13         compiler_3.5.2      rlang_0.3.4        
## [55] rhdf5_2.26.2        grid_3.5.2          iterators_1.0.10   
## [58] biomformat_1.10.1   rstudioapi_0.10     igraph_1.2.4       
## [61] labeling_0.3        rmarkdown_1.12      gtable_0.3.0       
## [64] codetools_0.2-16    multtest_2.38.0     reshape2_1.4.3     
## [67] R6_2.4.0            lubridate_1.7.4     knitr_1.22         
## [70] utf8_1.1.4          permute_0.9-5       ape_5.3            
## [73] stringi_1.4.3       parallel_3.5.2      Rcpp_1.0.1         
## [76] tidyselect_0.2.5    xfun_0.6